'''
File name: project.ipynb
Date created: 03/11/2019
Date last modified: 25/11/2019
Python Version: 3.7.4
''';
import pandas as pd
import numpy as np
import geopandas as gpd
import folium
import json
import math
import plotly
import plotly.express as px
import plotly.graph_objects as go
from utils import constants as cst
from utils import clean_database
from utils import areas_handler
from utils import chicago_map_handler as maps
# Set auto-reload
%load_ext autoreload
%autoreload 2
In this section we load and clean the databases. All methods used are described in utils/clean_database.
# Load the community areas dataframe (GeoJSON read via geopandas)
areas_DF = gpd.read_file(cst.AREAS_PATH)
# Clean the dataframe (see utils/clean_database.clean_areas_df)
areas_DF = clean_database.clean_areas_df(areas_DF)
areas_DF.head()
# Load the food inspections dataframe.
# error_bad_lines=False silently skips malformed CSV rows (deliberate best-effort load).
food_inspections_DF = pd.read_csv(
    cst.FOOD_INSPECTIONS_PATH,
    sep=',',
    header=0,
    names=cst.FOOD_INSPECTIONS_COL_NAMES,
    index_col=None,
    error_bad_lines=False,
)
# Clean the dataframe (see utils/clean_database)
food_inspections_DF = clean_database.clean_food_inspections_df(food_inspections_DF, areas_DF)
food_inspections_DF.head()
# Load the socio-economic indicators dataframe.
# error_bad_lines=False silently skips malformed CSV rows (deliberate best-effort load).
socio_economic_DF = pd.read_csv(
    cst.SOCIO_ECONOMIC_INDICATORS_PATH,
    sep=',',
    header=0,
    names=cst.SOCIO_ECONOMIC_COL_NAMES,
    index_col=None,
    error_bad_lines=False,
)
# Clean the dataframe (see utils/clean_database)
socio_economic_DF = clean_database.clean_socio_economic_df(socio_economic_DF)
socio_economic_DF.head()
# Load the life expectancy dataframe
life_expectancy_DF = pd.read_csv(cst.LIFE_EXPECTANCY_PATH, sep = ',', header = 0,
names = cst.LIFE_EXPECTANCY_COL_NAMES, index_col = None, error_bad_lines=False
)
# Clean the dataframe
# NOTE(review): this applies clean_socio_economic_df to the *life expectancy*
# data — possibly a copy-paste slip from the cell above. Confirm whether
# utils/clean_database has a dedicated life-expectancy cleaner before changing.
life_expectancy_DF = clean_database.clean_socio_economic_df(life_expectancy_DF)
life_expectancy_DF.head()
The solutions described are implemented in utils/areas_handler.
# Keep only the inspections whose latitude is missing
food_unknown_loc = food_inspections_DF[food_inspections_DF['lat'].isna()]
# Geocode those addresses (OpenStreetMap lookup — see utils/areas_handler)
unknown_locations = areas_handler.get_unknown_locations(food_unknown_loc)
# Show the addresses OpenStreetMap could not resolve either
unknown_locations[unknown_locations['lat'].isnull()].head()
# Display the Chicago areas
# This map contains additional layers to visually check if the found locations are actually within the borders of the city
map_chicago = maps.create_chicago_map()
# Overlay the geocoded unknown locations together with the inspection locations
map_chicago = maps.add_locations(map_chicago, unknown_locations, food_inspections_DF)
map_chicago
Description of the map
On the map above we can see the city of Chicago and the region where the facilities of the inspections presented in the food_inspections dataset are located. In the following, we will focus on and get some insights into that area.
# Use unknown_locations to fill lat and lng in the original dataframe food_inspections_DF.
# The merge brings in the geocoded coordinates (suffixed _y); the stale NaN
# columns (suffixed _x) are dropped and the new ones renamed back.
food_unknown_loc = (
    food_unknown_loc.reset_index()
                    .merge(unknown_locations, on="address", how='left')
                    .set_index('index')
                    .drop(['lat_x', 'lng_x'], axis=1)
                    .rename(columns={'lat_y': 'lat', 'lng_y': 'lng'})
)
# update() writes the filled coordinates back in place, matched on the index
food_inspections_DF.update(food_unknown_loc)
# Report how many latitudes are still unknown
print('%d food inspections still missing lat long info, out of %d. '%(food_inspections_DF.lat.isna().sum(), len(food_inspections_DF)))
So we have no more unknown locations in our food inspections dataframe.
# Resolve area numbers and delete unknown areas
# Takes a while if not already saved (presumably cached by areas_handler — see utils)
food_inspections_DF = areas_handler.get_locations_with_area(food_inspections_DF, areas_DF)
print("Number of locations: " + str(food_inspections_DF.shape[0]))
# Drop locations not in the city of chicago
food_inspections_DF = food_inspections_DF.dropna(subset=[cst.AREA_NUM])
print("Number of locations in the city of chicago: " + str(food_inspections_DF.shape[0]))
# Area numbers are whole numbers; cast so later grouping/merging keys are clean
food_inspections_DF[cst.AREA_NUM] = food_inspections_DF[cst.AREA_NUM].astype(int)
# Create new dataframe with number of inspections per area
inspection_counts = (
    food_inspections_DF[cst.AREA_NUM]
    .value_counts()
    .to_frame()
    .reset_index()
)
inspection_counts.columns = [cst.AREA_NUM, 'num_inspections']
# Area numbers as strings so they match the keys used by the choropleth map
inspection_counts[cst.AREA_NUM] = inspection_counts[cst.AREA_NUM].astype(str)
inspection_counts.sort_values('num_inspections');
heatmap_chicago = maps.heat_map(inspection_counts, "Inspections", cst.AREA_NUM, 'num_inspections')
heatmap_chicago
Description of the map
The heatmap above shows the number of inspections on the area we are focusing on and determined before. We can see that the more we approach the city center of Chicago, the higher the number of inspections.
However, it is also curious to see that in the north of Chicago, the number of inspections is higher than in the south. This could be due to a higher number of establishments in this area; we check this below.
Here we determine and analyse the number of establishments per region area.
# Count distinct establishments (unique license numbers) per community area.
# drop_duplicates returns a new frame, so food_inspections_DF is untouched.
count_DF = food_inspections_DF.drop_duplicates(subset=['license_num'])
count_ser = (
    count_DF[cst.AREA_NUM]
    .value_counts()
    .to_frame()
    .reset_index()
)
count_ser.columns = [cst.AREA_NUM, 'num_establishments']
# Area numbers as strings so they match the keys used by the choropleth map
count_ser[cst.AREA_NUM] = count_ser[cst.AREA_NUM].astype(str)
count_ser.sort_values('num_establishments');
heatmap_chicago = maps.heat_map(count_ser, "Number of Establishments", cst.AREA_NUM, 'num_establishments')
heatmap_chicago
Description of the map
The heatmap above shows the number of establishments on the area we want to analyze. Same as before, we observe that the more we approach the city center or the north of the city, the higher the number of establishments. Indeed, the number of inspections is correlated with the number of establishments; the more establishments we have on a given area, the higher the inspections will be on that area.
We compute a "risk_ser" dataframe, which has two columns containing the area number and the risk level. To simplify the risk values, we convert the risk in the food inspections dataframe to an integer.
def risk_to_num(x):
    """Convert a textual risk label to an integer severity.

    Maps 'Risk 3 (Low)' -> 1, 'Risk 2 (Medium)' -> 2, 'Risk 1 (High)' -> 3,
    and 'All' -> 2. Any other value is returned unchanged, so unexpected
    labels surface later when the column is cast to int.
    """
    mapping = {
        'Risk 3 (Low)': 1,
        'Risk 2 (Medium)': 2,
        'Risk 1 (High)': 3,
        'All': 2,
    }
    return mapping.get(x, x)
# Average numeric risk per community area
risk_DF = food_inspections_DF.copy()
risk_DF = risk_DF.dropna(subset=['risk'])
# Convert textual labels to integers (no lambda wrapper needed)
risk_DF['risk'] = risk_DF['risk'].apply(risk_to_num).astype(int)
risk_ser = risk_DF.groupby(cst.AREA_NUM)['risk'].mean().reset_index()
# Area numbers as strings so they match the keys used by the choropleth map
risk_ser[cst.AREA_NUM] = risk_ser[cst.AREA_NUM].astype(str)
heatmap_chicago = maps.heat_map(risk_ser, "Average risk", cst.AREA_NUM, 'risk')
heatmap_chicago
Description of the map
The heatmap above shows the average risk of the establishments for each community area. The risk of an establishment shows how it could potentially affect the public's health, with 1 being the highest and 3 the lowest. Furthermore, high risk establishments are inspected more frequently than low risk establishments.
# Inspections per establishment = per-area inspection count / per-area establishment count
inspections_per_est = inspection_counts.merge(count_ser, on=cst.AREA_NUM)
# Vectorized column division instead of a row-wise apply(axis=1):
# same values, but one C-level operation rather than a Python loop per row.
inspections_per_est['inspections_per_establishment'] = (
    inspections_per_est['num_inspections'] / inspections_per_est['num_establishments']
)
heatmap_chicago = maps.heat_map(inspections_per_est, "Number of Inspections per Establishment", cst.AREA_NUM, 'inspections_per_establishment')
heatmap_chicago
Description of the map
The heatmap above shows the number of inspections per establishment and as highlighted before, we can see that the establishments with a high number of inspections present high average risks.
Each establishment can receive a violation number between 1-44 or 70, and the number is followed by a specific description of the findings that caused the violation to be issued. The higher the number, the less severe the violation. Establishments collecting only high numbers will likely pass, whereas the others will likely fail.
An inspection of an establishment can pass, pass with conditions or fail depending on these numbers. The 'pass' condition is given when the establishment was found to have no critical or serious violations. Establishments that received a 'pass with conditions' were found to have critical or serious violations, but these violations were corrected during the inspection. Finally, the 'fail' condition is issued when the establishment was found to have critical or serious violations that were not correctable during the inspection.
We will analyse more deeply the violations for milestone 3.
# Merge socio-economic and life expectancy df's on the area number and names
socio_life_merged_DF = socio_economic_DF.merge(life_expectancy_DF, how="left", on=["community_area_num", "community_area_name"])
# Select two of the following columns for the heatmap
# Area numbers as strings so they match the keys used by the choropleth map
socio_life_merged_DF[cst.AREA_NUM] = socio_life_merged_DF[cst.AREA_NUM].astype(str)
# List the available columns so a metric can be picked for the heatmaps below
for c in socio_life_merged_DF.columns:
    print(c)
# good_indicator presumably inverts the colour scale so higher = better — see maps.heat_map
heatmap_chicago = maps.heat_map(socio_life_merged_DF, "Per capita income 2010",cst.AREA_NUM ,'per_capita_income', good_indicator = True)
heatmap_chicago
heatmap_chicago = maps.heat_map(socio_life_merged_DF, "Life expectancy 2010",cst.AREA_NUM ,'life_exp_2010', good_indicator = True)
heatmap_chicago
We could compare the map above, displaying the life expectancies, with the percentage of households below the poverty line.
# No good_indicator flag here, unlike the two maps above — this metric is a hardship indicator
heatmap_chicago = maps.heat_map(socio_life_merged_DF, "housholds below poverty line as percentage",cst.AREA_NUM ,'housholds_below_poverty_perc')
heatmap_chicago
In the first map, some community areas have no entry for their life expectancies. Clearly the life expectancies are higher near the center of Chicago, and on its north side. These are also the regions with the lowest share of households below the poverty line. Around West Garfield Park, the life expectancy drops drastically, as does the share of households above the poverty line.
The center and north of Chicago mostly have higher life expectancies and lower percentages of households below the poverty line.
With respect to food inspections, these areas also have more restaurants, and more food inspections per restaurant than the south.
# Check: range, mean with confidence interval.
# Summary statistics over the socio-economic columns; each community area
# counts equally (not weighted by population or area size).
important_columns = ['community_area_num', 'community_area_name', 'housing_crowded_perc',
'housholds_below_poverty_perc', 'aged_16_or_more_unemployed_perc',
'aged_25_or_more_without_high_school_diploma_perc',
'aged_under_18_or_over_64_perc', 'per_capita_income', 'hardship_idx',
'life_exp_2010']
socio_life_merged_DF[important_columns].describe()
We can see above the mean, standard deviations and confidence intervals for some columns that highlight the socio-economic factors of all areas. Of course in the database all community areas are considered equally, and so this does not take into account the sizes of the areas, or the number of people in them. However it is still interesting to analyse this result, to get some insight on what we are dealing with. For example it is surprising that the standard deviation of the income per capita is more than half of its mean! Or that the life expectancies in all areas have an std of more than 4 years.
In what follows we analyse more closely the correlations between the different features in this dataframe.
# Pairwise correlation matrix of the socio-economic metrics
corr = socio_life_merged_DF[cst.SOCIOECONOMIC_METRICS].corr()
corr
We now want to make sure that the correlation between any two metrics that are both good or both bad is always positive, and that the correlation between two metrics of different types is always negative. For this, we create the boolean variable sign_kept: only if the above condition is always met will sign_kept be true.
# "Bad" metrics rise with hardship; "good" metrics rise with wellbeing.
bad_metrics = {
    'housing_crowded_perc', 'housholds_below_poverty_perc',
    'aged_16_or_more_unemployed_perc',
    'aged_25_or_more_without_high_school_diploma_perc',
    'hardship_idx', 'aged_under_18_or_over_64_perc',
}
good_metrics = {'per_capita_income', 'life_exp_2010'}

# sign_kept stays True only if same-kind pairs never correlate negatively
# and opposite-kind pairs never correlate positively.
sign_kept = True
for c1 in cst.SOCIOECONOMIC_METRICS:
    for c2 in cst.SOCIOECONOMIC_METRICS:
        both_bad = c1 in bad_metrics and c2 in bad_metrics
        both_good = c1 in good_metrics and c2 in good_metrics
        opposite = ((c1 in bad_metrics and c2 in good_metrics)
                    or (c1 in good_metrics and c2 in bad_metrics))
        if both_bad or both_good:
            if corr[c1][c2] < 0:
                sign_kept = False
        elif opposite:
            if corr[c1][c2] > 0:
                sign_kept = False
if sign_kept:
    print('The correlation between indicators both good or both bad is always positive,\n and the correlation between a good and a bad indicator is always negative')
# Set correlation between each variable and itself to None in order to ignore it later.
# Use .loc rather than the original chained corr[c][c] = ... : chained indexing
# can write to a temporary copy (SettingWithCopyWarning) instead of corr itself.
for c in corr.columns:
    corr.loc[c, c] = None
# For each metric: the other metric it correlates most positively with
corrmax = pd.DataFrame(corr.idxmax()).rename({0: 'Strongest positive correlation'}, axis = 1)
corrmax['Correlation value'] = corr.max()
corrmax
# For each metric: the other metric it correlates most negatively with
corrmin = pd.DataFrame(corr.idxmin()).rename({0: 'Strongest negative correlation'}, axis = 1)
corrmin['Correlation value'] = corr.min()
corrmin
Of the above correlations, we notice certain things: Firstly, we can classify the indicators between good (life expectancy and per capita income) and bad (percentage of crowded houses, percentage of below-poverty households, percentage of over-16 unemployed people, percentage of over-25 people without a high school diploma, the hardship index, and the percentage of people under 18 and over 64), and the correlations between indicators either both good or both bad will always be positive, whereas the correlation between a good and a bad indicator will always be negative (sign_kept).
We also notice that the percentage of people under 18 or over 64 is a strong negative indicator: it is more negatively correlated to per capita income than, for example, the percentage of houses living below the poverty line.
It is indeed quite surprising that per capita average income is not more correlated to the percentage of houses living below the poverty line (correlation is -0.56). We plot the 2 metrics in order to see this:
One reason the linear correlation is so low is that the relationship is exponential. However, we also notice that the top 5 highest per capita neighbourhoods are not in the top 15 lowest poor households percentage.
We now plot a graph showing the per capita income on the y-axis and the housholds below poverty on the x-axis, of every community area. The plot is interactive, you can hover over a point to see its community area name and the precise values reported.
# Enable inline (offline) plotly rendering in the notebook
plotly.offline.init_notebook_mode(connected=True)
# Interactive scatter: hovering a point shows its community area name.
# Note: 'housholds_below_poverty_perc' is the actual (misspelled) column name.
fig = px.scatter(socio_life_merged_DF, x = 'housholds_below_poverty_perc', y = 'per_capita_income', \
hover_data=['community_area_name'])
fig.update_layout(
    xaxis_title="percentage of housholds below poverty",
    yaxis_title="per capita income",
    title=go.layout.Title(text="Community area per capita income with respect to housholds below poverty"),
)
fig.show()